{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/paraphrase/parasci/test.src.truecase.translated\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/paraphrase/parasci/test.tgt.truecase.translated\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/paraphrase/parasci/test.src.1.truecase.translated\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/paraphrase/parasci/test.tgt.1.truecase.translated"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = '3'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import sentencepiece as spm\n",
    "sp_model = spm.SentencePieceProcessor()\n",
    "sp_model.Load('prepare/sp10m.cased.ms-en.model')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "import tensorflow_text\n",
    "import struct\n",
    "\n",
    "unknown = b'\\xff\\xff\\xff\\xff'\n",
    "\n",
    "def load_graph(frozen_graph_filename):\n",
    "    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:\n",
    "        graph_def = tf.GraphDef()\n",
    "        graph_def.ParseFromString(f.read())\n",
    "        \n",
    "    for node in graph_def.node:\n",
    "        \n",
    "        if node.op == 'RefSwitch':\n",
    "          node.op = 'Switch'\n",
    "          for index in xrange(len(node.input)):\n",
    "            if 'moving_' in node.input[index]:\n",
    "              node.input[index] = node.input[index] + '/read'\n",
    "        elif node.op == 'AssignSub':\n",
    "          node.op = 'Sub'\n",
    "          if 'use_locking' in node.attr: del node.attr['use_locking']\n",
    "        elif node.op == 'AssignAdd':\n",
    "          node.op = 'Add'\n",
    "          if 'use_locking' in node.attr: del node.attr['use_locking']\n",
    "        elif node.op == 'Assign':\n",
    "          node.op = 'Identity'\n",
    "          if 'use_locking' in node.attr: del node.attr['use_locking']\n",
    "          if 'validate_shape' in node.attr: del node.attr['validate_shape']\n",
    "          if len(node.input) == 2:\n",
    "            node.input[0] = node.input[1]\n",
    "            del node.input[1]\n",
    "            \n",
    "        if 'Reshape/shape' in node.name or 'Reshape_1/shape' in node.name:\n",
    "            b = node.attr['value'].tensor.tensor_content\n",
    "            arr_int = [int.from_bytes(b[i:i + 4], 'little') for i in range(0, len(b), 4)]\n",
    "            if len(arr_int):\n",
    "                arr_byte = [unknown] + [struct.pack('<i', i) for i in arr_int[1:]]\n",
    "                arr_byte = b''.join(arr_byte)\n",
    "                node.attr['value'].tensor.tensor_content = arr_byte\n",
    "            \n",
    "            if len(node.attr['value'].tensor.int_val):\n",
    "                node.attr['value'].tensor.int_val[0] = -1\n",
    "    \n",
    "    with tf.Graph().as_default() as graph:\n",
    "        tf.import_graph_def(graph_def)\n",
    "    return graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "parasci_src = ['test.src.truecase.translated', \n",
    "               'test.src.1.truecase.translated']\n",
    "parasci_tgt = ['test.tgt.truecase.translated', \n",
    "               'test.tgt.1.truecase.translated']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4690 4690\n",
      "5098 5098\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "\n",
    "X, Y = [], []\n",
    "directory = './'\n",
    "for i in range(len(parasci_src)):\n",
    "    file_src = os.path.join(directory, parasci_src[i])\n",
    "    file_tgt = os.path.join(directory, parasci_tgt[i])\n",
    "    with open(file_src) as fopen:\n",
    "        left = fopen.read().split('\\n')\n",
    "    with open(file_tgt) as fopen:\n",
    "        right = fopen.read().split('\\n')\n",
    "    if len(left) < len(right):\n",
    "        right = right[:len(left)]\n",
    "    else:\n",
    "        left = left[:len(right)]\n",
    "        \n",
    "    x, y = [], []\n",
    "    for k in range(len(left)):\n",
    "        if len(left[k]) and len(right[k]):\n",
    "            l_left = json.loads(left[k])\n",
    "            l_right = json.loads(right[k])\n",
    "            x.append(l_left['ms'])\n",
    "            y.append(l_right['ms'])\n",
    "            x.append(l_right['ms'])\n",
    "            y.append(l_left['ms'])\n",
    "            \n",
    "    print(len(x), len(y))\n",
    "    X.extend(x)\n",
    "    Y.extend(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "g = load_graph('small/frozen_model.pb')\n",
    "x = g.get_tensor_by_name('import/inputs:0')\n",
    "logits = g.get_tensor_by_name('import/SelectV2_3:0')\n",
    "test_sess = tf.InteractiveSession(graph = g)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 979/979 [12:53<00:00,  1.27it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "batch_size = 10\n",
    "results = []\n",
    "for i in tqdm(range(0, len(X), batch_size)):\n",
    "    batch_x = X[i: i + batch_size]\n",
    "    batches = []\n",
    "    for b in batch_x:\n",
    "        batches.append(f'parafrasa: {b}')\n",
    "    g = test_sess.run(logits, feed_dict = {x:batches})\n",
    "    results.extend(g.tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('Kami menjalankan Mecab pada Hadoop 11, perisian sumber terbuka yang melaksanakan rangka kerja Map-Reduce, untuk perkataan Segmenting dan Pos yang menandai data.',\n",
       " 'Kami menjalankan Mecab 4 dengan kamus Ipa 5 pada Hadoop 6, perisian sumber terbuka yang melaksanakan rangka kerja Map-Reduce, untuk pensijilan kata selari, penandaan Part-Of-Speech, dan Kana Pronunciation Annotating.')"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "results_Y = [sp_model.DecodeIds(r) for r in results]\n",
    "results_Y[0], Y[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tensor2tensor.utils import bleu_hook"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.6174561"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bleu_hook.compute_bleu(reference_corpus = Y, \n",
    "                       translation_corpus = results_Y)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
